# Import the necessary libraries
import numpy as np  # Provides single and multi-dimensional arrays
import pandas as pd  # Provides DataFrames for easy data manipulation and statistical functions
import matplotlib.pyplot as plt  # Provides several functions for plotting various graphs
# '%matplotlib inline' is IPython magic and a SyntaxError in plain Python;
# keep it as a comment so this file runs both as a script and in a notebook.
# %matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Read vehicle data into a dataframe
orig_vehicle_df = pd.read_csv('vehicle-1.csv')
orig_vehicle_df.shape
# All the rows & features from data file are read into the dataframe
# Make a copy of the original data so we don't have to load data again if we need to roll back our changes later
# NOTE: a plain assignment (vehicle_df = orig_vehicle_df) would only create a
# second reference to the SAME DataFrame, so edits would corrupt the "backup".
# .copy() gives a truly independent frame to roll back to.
vehicle_df = orig_vehicle_df.copy()
vehicle_df.shape
# All the rows & features from the original dataframe are copied into vehicle_df
# Get more info on the datatypes of the columns
vehicle_df.info()
# There are 846 rows and 19 columns
# All of them are numeric columns except 'class' which is a string column
# Class is the target variable. Should be removed when PCA is done
vehicle_df.describe()
vehicle_df.head()
# Check for duplicate data
dups = vehicle_df.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
vehicle_df[dups]
# To check for duplicates by column
#pd.concat(g for _, g in vehicle_df.groupby("col_name") if len(g) > 1).count()
# Remove duplicate rows
#print('Number of rows before discarding duplicates = %d' % (vehicle_df.shape[0]))
#vehicle_df.drop_duplicates(subset = None, keep = 'first', inplace=True)
#print('Number of rows after discarding duplicates = %d' % (vehicle_df.shape[0]))
# Let's look at the target column - 'class'
vehicle_df.groupby('class')['class'].count()
# There are only three types in class column with 'car' having the most occurrences
# The ratio of bus:car:van = 0.257 : 0.507 : 0.235
# No missing values
# Use LabelEncoder to change labels to numeric values before proceeding with further analysis
# class - Categorical
# Left panel: pie chart of class counts; right panel: bar chart of the same counts
plt.figure(1, figsize=(10,5))
plt.subplot(121)
vehicle_df['class'].value_counts().plot(kind='pie')
plt.subplot(122)
sns.countplot(vehicle_df['class'])
# Encode class variable to numeric labels
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# LabelEncoder assigns integers in sorted (alphabetical) label order,
# which is why bus < car < van maps to 0 < 1 < 2 below.
vehicle_df['class'] = le.fit_transform(vehicle_df['class'])
vehicle_df.head()
# class column has been encoded to numeric values
#bus=0
#car=1
#van=2
# Create separate dataframes for each class of vehicles for further comparison
# NOTE(review): these slices are taken BEFORE the missing-value imputation
# below, so they still contain the NaNs; they are later used both as the
# source of per-class medians and for the per-class density plots.
# Bus
bus_df = vehicle_df[vehicle_df['class'] == 0]
# Car
car_df = vehicle_df[vehicle_df['class'] == 1]
# Van
van_df = vehicle_df[vehicle_df['class'] == 2]
# Check for missing values
vehicle_df.isnull().sum()
# There are some missing values in several columns as indicated below
# Function for filling in missing values
def impute_missing_vals(df):
    """Fill missing values in each column with the median of that column
    computed within the row's vehicle class (0=bus, 1=car, 2=van).

    The frame is modified in place; nothing is returned.

    Improvements over the original version:
    - medians are computed from ``df`` itself via groupby, so the function
      no longer depends on the global bus_df/car_df/van_df slices;
    - the 'class' grouping key itself is skipped (it is never imputed).
    """
    for col in df.columns:
        if col == 'class':
            continue
        print('Imputing missing values for column:', col)
        # Median per class; pandas' median() skips NaN automatically, so the
        # values match medians taken over the non-missing rows of each class.
        class_medians = df.groupby('class')[col].median()
        for cls, med_val in class_medians.items():
            print('Class', cls, 'median:', med_val)
            # Fill missing value with the median of that col for that class
            df.loc[df[col].isna() & (df['class'] == cls), col] = med_val
# Impute missing value with the median of that col for that class of vehicles
# (mutates vehicle_df in place)
impute_missing_vals(vehicle_df)
vehicle_df.head(20)
# Check for missing values again - every count should now be 0
vehicle_df.isnull().sum()
# Distributions for each of the numeric columns
vehicle_df.hist(figsize=(20,15))
# Columns with almost normal distributions - circularity, compactness, hollows ratio, max length rectangularity, radius ratio, scaled_radius_of_gyration, skewness_about.2
# Columns with right skew, indicating possible presence of outliers - pr.axis_rectangularity, pr.axis_aspect_ratio, scaled_radius_of_gyration.1, radius_ratio, scaled_variance, scaled_variance.1, skewness_about, skewness_about.1
# Columns with left skew - distance_circularity
# Define function to identify outliers given the dataframe and col
def identify_outliers(df, col):
    """Return the rows of ``df`` whose value in ``col`` lies outside the
    Tukey fences, i.e. more than 1.5 * IQR below Q1 or above Q3."""
    lower_quartile = df[col].quantile(q=0.25)
    upper_quartile = df[col].quantile(q=0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread
    print('Column:', col)
    print('IQR:', spread)
    print('Lower Limit -', low_fence)
    print('Upper Limit -', high_fence)
    # Keep rows strictly beyond either fence
    is_outlier = (df[col] < low_fence) | (df[col] > high_fence)
    return df[is_outlier]
# compactness - Numerical/Continuous
# Panels: overall distribution with mean line (131), per-class boxplot (132),
# per-class probability density curves (133)
plt.figure(1, figsize=(16,5))
plt.subplot(131)
sns.distplot(vehicle_df['compactness'])
plt.axvline(vehicle_df['compactness'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(132)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['compactness'], orient='vert')
plt.subplot(133)
# Look at the Probability Density Function
sns.distplot(bus_df['compactness'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['compactness'], kde=True, hist=False, label='Car')
sns.distplot(van_df['compactness'], kde=True, hist=False, label='Van')
# Distribution is normal.
# The compactness median for car is higher than for van and bus, which are almost the same.
# There are no outliers
# There is high probability that if the vehicle compactness is between 80 - 100, the vehicle is a van.
# circularity - Numerical/Continuous
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['circularity'])
plt.axvline(vehicle_df['circularity'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['circularity'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['circularity'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['circularity'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['circularity'], kde=True, hist=False, label='Car')
sns.distplot(van_df['circularity'], kde=True, hist=False, label='Van')
# Distribution is normal.
# There are no outliers in the whole column but we see some outliers in the 'bus' category of vehicles.
# Cars are more circular, followed by bus and then van.
# distance_circularity - Numerical/Continuous
# Panels: overall distribution with mean line (221), per-class boxplot (224),
# overall boxplot (223), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['distance_circularity'])
plt.axvline(vehicle_df['distance_circularity'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['distance_circularity'], orient='v')
plt.subplot(223)
sns.boxplot(vehicle_df['distance_circularity'], orient='h')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['distance_circularity'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['distance_circularity'], kde=True, hist=False, label='Car')
sns.distplot(van_df['distance_circularity'], kde=True, hist=False, label='Van')
# Distribution has two gaussians, with peaks at 70 and 105.
# Also 2 gaussians when density is plotted, indicating that 2 clusters of vehicles may be present in each class
# There are no outliers in the whole column, but we see outliers in the 'bus' category of vehicles.
# The median for car is much higher than van or bus.
# The density plots are overlapping for each vehicle type, indicating that this column may not be of much help in segregating the vehicles
# radius_ratio - Numerical/Continuous
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['radius_ratio'])
plt.axvline(vehicle_df['radius_ratio'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['radius_ratio'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['radius_ratio'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['radius_ratio'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['radius_ratio'], kde=True, hist=False, label='Car')
sns.distplot(van_df['radius_ratio'], kde=True, hist=False, label='Van')
# Distribution is mostly normal with two gaussians, mean at ~170.
# There are outliers in the 'van' category of vehicles.
# The radius_ratio median for car is high, followed by bus and then van.
# The density plots are overlapping for each vehicle type
# Identify outliers (IQR / Tukey-fence method)
outliers = identify_outliers(vehicle_df, 'radius_ratio')
outliers
# The values which are identified as outliers are natural. Hence no need to remove them.
# pr.axis_aspect_ratio - Numerical/Continuous
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['pr.axis_aspect_ratio'])
plt.axvline(vehicle_df['pr.axis_aspect_ratio'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['pr.axis_aspect_ratio'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['pr.axis_aspect_ratio'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['pr.axis_aspect_ratio'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['pr.axis_aspect_ratio'], kde=True, hist=False, label='Car')
sns.distplot(van_df['pr.axis_aspect_ratio'], kde=True, hist=False, label='Van')
print('Column axis_aspect_ratio skew:', vehicle_df['pr.axis_aspect_ratio'].skew())
# Distribution is right skewed, indicating presence of outliers.
# There are outliers in the 'bus' & 'van' categories of vehicles.
# The axis_aspect_ratio median for bus is high, followed by car and then van.
# The density plots are overlapping for each vehicle type.
# Identify outliers (IQR / Tukey-fence method)
outliers = identify_outliers(vehicle_df, 'pr.axis_aspect_ratio')
outliers
# Optional capping of the extreme values, deliberately left disabled:
# vehicle_df.loc[vehicle_df['pr.axis_aspect_ratio'] > 77, 'pr.axis_aspect_ratio'] = 77
# The values which are identified as outliers are natural. Hence no need to remove them.
# They may form a cluster of their own, so will not alter them.
# max.length_aspect_ratio - Numerical/Discrete
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['max.length_aspect_ratio'])
plt.axvline(vehicle_df['max.length_aspect_ratio'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['max.length_aspect_ratio'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['max.length_aspect_ratio'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['max.length_aspect_ratio'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['max.length_aspect_ratio'], kde=True, hist=False, label='Car')
sns.distplot(van_df['max.length_aspect_ratio'], kde=True, hist=False, label='Van')
print('Column max.length_aspect_ratio skew:', vehicle_df['max.length_aspect_ratio'].skew())
# Distribution is right skewed, indicating presence of outliers, bi-modal.
# There are outliers in the 'bus' & 'van' categories of vehicles.
# The length_aspect_ratio median for car is high, followed by bus and then van.
# The density plots are overlapping for each vehicle type.
# Majority of the data lies between 0 & 15 - zoom in on that range:
sns.distplot(vehicle_df[(vehicle_df['max.length_aspect_ratio']>0) & (vehicle_df['max.length_aspect_ratio']<15)]['max.length_aspect_ratio'])
# Identify outliers (IQR / Tukey-fence method)
outliers = identify_outliers(vehicle_df, 'max.length_aspect_ratio')
outliers
# The HIGH outlier values look natural - they may form a cluster of their own,
# so we do not alter them.
# The single LOW outlier (value 2) looks like an anomaly, so clamp it to the
# lower limit (2.5) computed above.
vehicle_df.loc[vehicle_df['max.length_aspect_ratio'] == 2, 'max.length_aspect_ratio'] = 2.5
# Verified that it has been imputed
# scatter_ratio - Numerical/Continuous
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['scatter_ratio'])
plt.axvline(vehicle_df['scatter_ratio'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['scatter_ratio'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['scatter_ratio'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['scatter_ratio'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['scatter_ratio'], kde=True, hist=False, label='Car')
sns.distplot(van_df['scatter_ratio'], kde=True, hist=False, label='Van')
print('Column scatter_ratio skew:', vehicle_df['scatter_ratio'].skew())
# Distribution is mostly normal, bi-modal.
# There are no outliers in the column as a whole, but we see some outliers in the 'bus' category of vehicles. No need to impute.
# The scatter_ratio median for car is high, followed by bus and then van.
# The density plots are mostly overlapping for each vehicle type.
# elongatedness - Numerical/Continuous
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['elongatedness'])
plt.axvline(vehicle_df['elongatedness'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['elongatedness'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['elongatedness'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['elongatedness'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['elongatedness'], kde=True, hist=False, label='Car')
sns.distplot(van_df['elongatedness'], kde=True, hist=False, label='Van')
print('Column elongatedness skew:', vehicle_df['elongatedness'].skew())
# Distribution is mostly normal, bi-modal.
# There are no outliers.
# The elongatedness median for van is high, followed by bus and then car.
# The density plots show clearly that elongatedness is less for car and more for bus and van.
# pr.axis_rectangularity - Numerical/Discrete
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['pr.axis_rectangularity'])
plt.axvline(vehicle_df['pr.axis_rectangularity'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['pr.axis_rectangularity'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['pr.axis_rectangularity'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['pr.axis_rectangularity'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['pr.axis_rectangularity'], kde=True, hist=False, label='Car')
sns.distplot(van_df['pr.axis_rectangularity'], kde=True, hist=False, label='Van')
print('Column pr.axis_rectangularity skew:', vehicle_df['pr.axis_rectangularity'].skew())
# Distribution is right skewed, multiple gaussians.
# No outliers in the whole column, but we see some present in the bus category.
# The axis_rectangularity median for car is high, compared to bus and van.
# The density plots show clearly that axis_rectangularity for van is between 16 - 21, but more widespread for bus and car.
# max.length_rectangularity - Numerical/Continuous
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['max.length_rectangularity'])
plt.axvline(vehicle_df['max.length_rectangularity'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['max.length_rectangularity'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['max.length_rectangularity'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['max.length_rectangularity'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['max.length_rectangularity'], kde=True, hist=False, label='Car')
sns.distplot(van_df['max.length_rectangularity'], kde=True, hist=False, label='Van')
print('Column max.length_rectangularity skew:', vehicle_df['max.length_rectangularity'].skew())
# Distribution is mostly normal, multiple gaussians.
# No outliers in the whole column, but some are seen in the bus category.
# The length_rectangularity median for car is high, compared to bus and van.
# The density plots are overlapping.
# scaled_variance - Numerical/Continuous
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['scaled_variance'])
plt.axvline(vehicle_df['scaled_variance'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['scaled_variance'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['scaled_variance'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['scaled_variance'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['scaled_variance'], kde=True, hist=False, label='Car')
sns.distplot(van_df['scaled_variance'], kde=True, hist=False, label='Van')
print('Column scaled_variance skew:', vehicle_df['scaled_variance'].skew())
# Distribution is right skewed, multiple gaussians.
# Outliers are present in bus and van categories.
# The scaled_variance median for car is high, followed by bus and then van.
# The density plots are overlapping.
# Identify outliers (IQR / Tukey-fence method)
outliers = identify_outliers(vehicle_df, 'scaled_variance')
outliers
# Most of the identified outlier values look natural - they may form a cluster
# of their own, so we do not alter them.
# The single extreme value (320.0) sits far beyond the rest of the data, so
# clamp it to the upper limit (292) computed above.
vehicle_df.loc[vehicle_df['scaled_variance'] == 320.0, 'scaled_variance'] = 292
# Verified that it has been imputed
# scaled_radius_of_gyration - Numerical/Continuous
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['scaled_radius_of_gyration'])
plt.axvline(vehicle_df['scaled_radius_of_gyration'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['scaled_radius_of_gyration'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['scaled_radius_of_gyration'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['scaled_radius_of_gyration'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['scaled_radius_of_gyration'], kde=True, hist=False, label='Car')
sns.distplot(van_df['scaled_radius_of_gyration'], kde=True, hist=False, label='Van')
print('Column scaled_radius_of_gyration skew:', vehicle_df['scaled_radius_of_gyration'].skew())
# Distribution is mostly normal, multiple gaussians.
# No outliers in the whole column, but some are present in the bus category.
# The median for car is high, followed by bus and then van.
# The density plots are overlapping.
# scaled_radius_of_gyration.1 - Numerical/Continuous
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['scaled_radius_of_gyration.1'])
plt.axvline(vehicle_df['scaled_radius_of_gyration.1'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['scaled_radius_of_gyration.1'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['scaled_radius_of_gyration.1'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['scaled_radius_of_gyration.1'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['scaled_radius_of_gyration.1'], kde=True, hist=False, label='Car')
sns.distplot(van_df['scaled_radius_of_gyration.1'], kde=True, hist=False, label='Van')
print('Column scaled_radius_of_gyration.1 skew:', vehicle_df['scaled_radius_of_gyration.1'].skew())
# Distribution is right skewed, multiple gaussians.
# Outliers are present in all categories.
# The median for bus is high, followed by van and then car.
# The density plots are overlapping.
# Identify outliers (IQR / Tukey-fence method)
outliers = identify_outliers(vehicle_df, 'scaled_radius_of_gyration.1')
outliers
# The values which are identified as outliers are natural. Hence no need to remove them.
# They may form a cluster of their own, so will not alter them.
# skewness_about - Numerical/Discrete
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['skewness_about'])
plt.axvline(vehicle_df['skewness_about'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['skewness_about'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['skewness_about'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['skewness_about'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['skewness_about'], kde=True, hist=False, label='Car')
sns.distplot(van_df['skewness_about'], kde=True, hist=False, label='Van')
print('Column skewness_about skew:', vehicle_df['skewness_about'].skew())
# Distribution is right skewed.
# Outliers are present in bus and van categories.
# The median for bus is lower than van and car.
# The density plots are overlapping.
# Identify outliers (IQR / Tukey-fence method)
outliers = identify_outliers(vehicle_df, 'skewness_about')
outliers
# The values which are identified as outliers are natural. Hence no need to remove them.
# They may form a cluster of their own, so will not alter them.
# skewness_about.1 - Numerical/Discrete
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['skewness_about.1'])
plt.axvline(vehicle_df['skewness_about.1'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['skewness_about.1'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['skewness_about.1'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['skewness_about.1'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['skewness_about.1'], kde=True, hist=False, label='Car')
sns.distplot(van_df['skewness_about.1'], kde=True, hist=False, label='Van')
print('Column skewness_about.1 skew:', vehicle_df['skewness_about.1'].skew())
# Distribution is right skewed.
# Outliers are present in bus and van categories.
# The median for bus and van are equal but lower than car.
# The density plots are overlapping, but cars are clearly distinguishable after 35.
# Identify outliers (IQR / Tukey-fence method)
outliers = identify_outliers(vehicle_df, 'skewness_about.1')
outliers
# The single identified outlier is just above the upper limit, hence no need to remove it.
# skewness_about.2 - Numerical/Discrete
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class VIOLIN plot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['skewness_about.2'])
plt.axvline(vehicle_df['skewness_about.2'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['skewness_about.2'], orient='h')
plt.subplot(224)
sns.violinplot(x=vehicle_df['class'], y=vehicle_df['skewness_about.2'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['skewness_about.2'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['skewness_about.2'], kde=True, hist=False, label='Car')
sns.distplot(van_df['skewness_about.2'], kde=True, hist=False, label='Van')
print('Column skewness_about.2 skew:', vehicle_df['skewness_about.2'].skew())
# Distribution is almost normal.
# No outliers in the whole column, but some are present in the car category.
# The median for car is higher than van and bus.
# The density plots are overlapping.
# hollows_ratio - Numerical/Discrete
# Panels: overall distribution with mean line (221), overall boxplot (223),
# per-class boxplot (224), per-class probability density curves (222)
plt.figure(1, figsize=(16,7))
plt.subplot(221)
sns.distplot(vehicle_df['hollows_ratio'])
plt.axvline(vehicle_df['hollows_ratio'].mean(), color='k', linestyle='dashed', linewidth=1)
plt.subplot(223)
sns.boxplot(vehicle_df['hollows_ratio'], orient='h')
plt.subplot(224)
sns.boxplot(x=vehicle_df['class'], y=vehicle_df['hollows_ratio'], orient='vert')
plt.subplot(222)
# Look at the Probability Density Function
sns.distplot(bus_df['hollows_ratio'], kde=True, hist=False, label='Bus')
sns.distplot(car_df['hollows_ratio'], kde=True, hist=False, label='Car')
sns.distplot(van_df['hollows_ratio'], kde=True, hist=False, label='Van')
print('Column hollows_ratio skew:', vehicle_df['hollows_ratio'].skew())
# Distribution has a negative skew, bi-modal.
# No outliers in the whole column, but some are present in the car category.
# The median for car is high followed by van and then bus.
# The density plots are overlapping.
# Heatmap to see correlation between each and every column
plt.subplots(figsize = (15,15))
sns.heatmap(vehicle_df.corr(), annot=True)
# Class variable shows high correlation with the below variables
# High Positive corr - elongatedness, hollows_ratio, max.length_aspect_ratio
# High Negative corr - scaled_variance, scatter_ratio, scaled_variance.1, pr.axis_rectangularity
# Use pairplot() to plot pairwise relationships between all columns in the dataset,
# colored by the encoded class label
sns.pairplot(vehicle_df, hue="class", palette="muted")
# There are several features which are exhibiting high correlation between each other.
# So performing PCA will help reduce dimensionality by removing the highly related ones.
# Elongatedness, scaled_variance and scatter_ratio seem to be important classifiers
# Separate the independent attributes from the target variable
X = vehicle_df.drop(columns=['class'], axis=1)
y = pd.DataFrame(vehicle_df['class'])
print('X:', X.shape)
print('y:', y.shape)
# Create the training and test data set in the ratio of 75:25 respectively
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)
# Create logistic regression object
reg = LogisticRegression()
# Train the model using the (unscaled) training set.
# .values.ravel() flattens the (n, 1) target DataFrame into the 1-D array
# sklearn expects, avoiding a DataConversionWarning.
reg.fit(X_train, y_train.values.ravel())
# Making predictions on the testing set
y_pred = reg.predict(X_test)
# Comparing actual response values (y_test) with predicted response values (y_pred)
LR_acc = round(accuracy_score(y_test, y_pred), 2)*100
print("Logistic Regression model accuracy(in %):", LR_acc)
X_train.head()
# Since the dimensions of the data are different, we will standardize the data using the Standard Scaler
from sklearn.preprocessing import StandardScaler
# Fit the scaling on the training data only (so no test-set statistics leak in)
sc = StandardScaler()
sc.fit(X_train)
# Remember the column labels before transform() turns the frames into ndarrays,
# instead of re-typing the full 18-name list by hand (fragile if features change).
feature_cols = X_train.columns
# Transform both the train and test data
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
# Rebuild labelled DataFrames from the scaled arrays
X_train = pd.DataFrame(X_train, columns=feature_cols)
X_test = pd.DataFrame(X_test, columns=feature_cols)
X_test.head()
# All the columns are in the same scale. We get the same results using the zscore functions
# Create logistic regression object
reg = LogisticRegression()
# Train the model using the scaled training set
# (.values.ravel() gives the 1-D target array sklearn expects)
reg.fit(X_train, y_train.values.ravel())
# Making predictions on the testing set
y_pred = reg.predict(X_test)
# Comparing actual response values (y_test) with predicted response values (y_pred)
LR_acc = round(accuracy_score(y_test, y_pred), 2)*100
print("Logistic Regression model accuracy(in %):", LR_acc)
# PCA
# Step 1 - Create covariance matrix (features as rows, hence the transpose)
# cov_matrix = np.cov(vehicle_df_z.T)
cov_matrix = np.cov(X_train.T)
# NOTE: print('...%s', x) never substitutes - it printed the literal '%s'.
# Use %-formatting explicitly instead.
print('Covariance Matrix \n%s' % cov_matrix)
# Step 2 - Get eigen values and eigen vectors
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
print('Eigen Vectors \n%s' % eig_vecs)
print('\n Eigen Values \n%s' % eig_vals)
# Percentage of total variance explained by each component, largest first
tot = sum(eig_vals)
var_exp = [( i/tot ) * 100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
plt.plot(var_exp)
# Visually we can observe that there is a steep drop in variance explained with increase in number of PC's.
# With 7 components we can explain ~96% of variance
# With 10 components we can explain ~98.8% of variance
# With 11 components we can explain ~99% of variance
# We will proceed with 10 components here. But depending on requirement, 95% variation or 7 components will also do good
# Plotting individual vs cumulative explained variance
plt.figure(figsize=(10 , 5))
plt.bar(range(1, eig_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, eig_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
# Using scikit learn to find PCA here. It does all the above steps and maps data to PCA dimensions in one shot
from sklearn.decomposition import PCA
# We are generating only 10 PCA dimensions (dimensionality reduction from 18 to 10)
pca = PCA(n_components=10)
# Fit and Transform using training data, reduce dimensionality to 10
train_reduced = pca.fit_transform(X_train)
train_reduced.transpose()
# Plot pair wise plots for the reduced number of dimensions
sns.pairplot(pd.DataFrame(train_reduced))
# We see that all the pair plots have cloud-like forms,
# which indicates that not only were the dimensions reduced, but the multicollinearity has also been removed.
# Reduce the dimensionality of test data also to 10
# (transform only - the PCA was fitted on the training data)
test_reduced = pca.transform(X_test)
test_reduced.transpose()
pca.components_
# One row per principal component, one column per original feature
df_comp = pd.DataFrame(pca.components_, columns=list(X_train))
df_comp.head()
plt.figure(figsize=(12,6))
sns.heatmap(df_comp,cmap='plasma',)
# This heatmap and the color bar basically represent the correlation between the various features and the principal component itself
# PC# 1 looks highly related to skewness_about.2 & hollows_ratio
# So does PC# 3 & skewness_about.1
# And PC# 9 & scaled_radius_of_gyration
# Create logistic regression object
reg = LogisticRegression()
# Train the model using the training set with reduced dimensions
# (.values.ravel() gives the 1-D target array sklearn expects,
# avoiding a DataConversionWarning)
reg.fit(train_reduced, y_train.values.ravel())
# Making predictions on the testing set
y_pred = reg.predict(test_reduced)
# Comparing actual response values (y_test) with predicted response values (y_pred)
LR_acc = round(accuracy_score(y_test, y_pred), 2)*100
print("Logistic Regression model accuracy(in %):", LR_acc)
# Conclusion
# As we can see, we were able to reduce complexity by reducing dimensions from 18 to 10,
# at the cost of a small drop in accuracy.